1. Introduction

1.1 Read in Breast cancer dataset and libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(naniar)

BC_data <- read_tsv("GSE96058_expression_matrix_3decimals.tsv")
## Rows: 30865 Columns: 3410
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr    (1): Unnamed: 0
## dbl (3409): F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1.2 Read in textfile dataset

textfile <- read_tsv("GSE96058_metadata.txt")
## Rows: 3069 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): sample_id, title, geo_accession, type, source_name_ch1, instrument...
## dbl (19): channel_count, age at diagnosis:ch1, chemo treated:ch1, endocrine ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(textfile)
##   sample_id            title           geo_accession          type          
##  Length:3069        Length:3069        Length:3069        Length:3069       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  channel_count source_name_ch1    age at diagnosis:ch1 chemo treated:ch1
##  Min.   :1     Length:3069        Min.   :24.00        Min.   :0.0000   
##  1st Qu.:1     Class :character   1st Qu.:53.00        1st Qu.:0.0000   
##  Median :1     Mode  :character   Median :64.00        Median :0.0000   
##  Mean   :1                        Mean   :62.76        Mean   :0.4072   
##  3rd Qu.:1                        3rd Qu.:71.00        3rd Qu.:1.0000   
##  Max.   :1                        Max.   :96.00        Max.   :1.0000   
##                                                        NA's   :21       
##  endocrine treated:ch1 er prediction mgc:ch1 er prediction sgc:ch1
##  Min.   :0.0000        Min.   :0.0000        Min.   :0.0000       
##  1st Qu.:1.0000        1st Qu.:1.0000        1st Qu.:1.0000       
##  Median :1.0000        Median :1.0000        Median :1.0000       
##  Mean   :0.7765        Mean   :0.7983        Mean   :0.8866       
##  3rd Qu.:1.0000        3rd Qu.:1.0000        3rd Qu.:1.0000       
##  Max.   :1.0000        Max.   :1.0000        Max.   :1.0000       
##  NA's   :22                                                       
##  er status:ch1   her2 prediction mgc:ch1 her2 prediction sgc:ch1
##  Min.   :0.000   Min.   :0.0000          Min.   :0.00000        
##  1st Qu.:1.000   1st Qu.:0.0000          1st Qu.:0.00000        
##  Median :1.000   Median :0.0000          Median :0.00000        
##  Mean   :0.922   Mean   :0.1297          Mean   :0.09286        
##  3rd Qu.:1.000   3rd Qu.:0.0000          3rd Qu.:0.00000        
##  Max.   :1.000   Max.   :1.0000          Max.   :1.00000        
##  NA's   :199                                                    
##  her2 status:ch1  instrument model:ch1 ki67 prediction mgc:ch1
##  Min.   :0.0000   Length:3069          Min.   :0.0000         
##  1st Qu.:0.0000   Class :character     1st Qu.:0.0000         
##  Median :0.0000   Mode  :character     Median :0.0000         
##  Mean   :0.1323                        Mean   :0.3001         
##  3rd Qu.:0.0000                        3rd Qu.:1.0000         
##  Max.   :1.0000                        Max.   :1.0000         
##  NA's   :105                                                  
##  ki67 prediction sgc:ch1 ki67 status:ch1  lymph node group:ch1
##  Min.   :0.0000          Min.   :0.0000   Length:3069         
##  1st Qu.:0.0000          1st Qu.:0.0000   Class :character    
##  Median :1.0000          Median :1.0000   Mode  :character    
##  Mean   :0.5888          Mean   :0.5862                       
##  3rd Qu.:1.0000          3rd Qu.:1.0000                       
##  Max.   :1.0000          Max.   :1.0000                       
##                          NA's   :1682                         
##  lymph node status:ch1 nhg prediction mgc:ch1   nhg:ch1         
##  Length:3069           Length:3069            Length:3069       
##  Class :character      Class :character       Class :character  
##  Mode  :character      Mode  :character       Mode  :character  
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  overall survival days:ch1 overall survival event:ch1 pam50 subtype:ch1 
##  Min.   :  56              Min.   :0.0000             Length:3069       
##  1st Qu.:1236              1st Qu.:0.0000             Class :character  
##  Median :1612              Median :0.0000             Mode  :character  
##  Mean   :1606              Mean   :0.1049                               
##  3rd Qu.:2004              3rd Qu.:0.0000                               
##  Max.   :2474              Max.   :1.0000                               
##                                                                         
##  pgr prediction mgc:ch1 pgr prediction sgc:ch1 pgr status:ch1  
##  Min.   :0.0000         Min.   :0.0000         Min.   :0.0000  
##  1st Qu.:1.0000         1st Qu.:1.0000         1st Qu.:1.0000  
##  Median :1.0000         Median :1.0000         Median :1.0000  
##  Mean   :0.7739         Mean   :0.7758         Mean   :0.8678  
##  3rd Qu.:1.0000         3rd Qu.:1.0000         3rd Qu.:1.0000  
##  Max.   :1.0000         Max.   :1.0000         Max.   :1.0000  
##                                                NA's   :330     
##  scan-b external id:ch1 tumor size:ch1  
##  Length:3069            Min.   :  0.00  
##  Class :character       1st Qu.: 12.00  
##  Mode  :character       Median : 17.00  
##                         Mean   : 19.98  
##                         3rd Qu.: 24.00  
##                         Max.   :126.00  
##                         NA's   :32

1.3 Make merged dataset

BC_data_new <- column_to_rownames(BC_data, var = "Unnamed: 0")
BC_data_new2 <- t(BC_data_new)
BC_data_new2 <- as.data.frame(t(BC_data_new))
BC_data_new2 <- rownames_to_column(BC_data_new2, var = "title")
merged_data_2 <- left_join(textfile, BC_data_new2, by = "title")

2. Preprocessing

2.1 Missing-value summary for the metadata

reduced_merged_data <- merged_data_2 %>%
  select(`overall survival days:ch1`, `overall survival event:ch1`, `ABHD11`, `ESR1`, `chemo treated:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `er status:ch1`, `her2 status:ch1`, `pgr status:ch1`, `endocrine treated:ch1`)

reduced_merged_data$`chemo treated:ch1`<- as.factor(reduced_merged_data$`chemo treated:ch1`)
reduced_merged_data$`overall survival event:ch1`<- as.factor(reduced_merged_data$`overall survival event:ch1`)
reduced_merged_data$`er status:ch1`<- as.factor(reduced_merged_data$`er status:ch1`)
reduced_merged_data$`her2 status:ch1`<- as.factor(reduced_merged_data$`her2 status:ch1`)
reduced_merged_data$`pgr status:ch1`<- as.factor(reduced_merged_data$`pgr status:ch1`)
reduced_merged_data$`endocrine treated:ch1` <- as.factor(reduced_merged_data$`endocrine treated:ch1`)

print(reduced_merged_data)
## # A tibble: 3,069 × 11
##    `overall survival days:ch1` `overall survival event:ch1` ABHD11  ESR1
##                          <dbl> <fct>                         <dbl> <dbl>
##  1                        2367 0                              4.13 0.962
##  2                        2367 0                              4.34 6.17 
##  3                        2168 1                              5.27 7.98 
##  4                        2416 0                              5.87 6.05 
##  5                        2389 0                              3.32 5.50 
##  6                        2373 0                              4.51 0.263
##  7                        2380 0                              6.47 1.14 
##  8                        2324 0                              5.66 5.97 
##  9                        2367 0                              4.50 6.54 
## 10                        1962 1                              5.86 7.57 
## # ℹ 3,059 more rows
## # ℹ 7 more variables: `chemo treated:ch1` <fct>, `age at diagnosis:ch1` <dbl>,
## #   `tumor size:ch1` <dbl>, `er status:ch1` <fct>, `her2 status:ch1` <fct>,
## #   `pgr status:ch1` <fct>, `endocrine treated:ch1` <fct>

2.1.1 Heatmap of NAs

metadata_missing <- reduced_merged_data %>%
  select(where(~ any(is.na(.))))
vis_miss(reduced_merged_data, sort_miss = TRUE) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title.x = element_blank()
  ) +
  labs(title = "Missing Data in Metadata")

location_missing_data_chemo <- textfile %>%
  filter(is.na(`chemo treated:ch1`)) %>%
  select(sample_id, `er status:ch1`, `pgr status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `tumor size:ch1`, `age at diagnosis:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`)

location_missing_data_tumor_size <- textfile %>%
  filter(is.na(`tumor size:ch1`)) %>%
  select(sample_id, `er status:ch1`, `pgr status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)

location_missing_data_tumor_size <- textfile %>%
  filter(is.na(`tumor size:ch1`)) %>%
  select(sample_id, `er status:ch1`, `pgr status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)

location_missing_data_her2 <- textfile %>%
  filter(is.na(`her2 status:ch1`)) %>%
  select(sample_id, `er status:ch1`, `pgr status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)

location_missing_data_er <- textfile %>%
  filter(is.na(`er status:ch1`)) %>%
  select(sample_id, `pgr status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)

location_missing_data_pgr <- textfile %>%
  filter(is.na(`pgr status:ch1`)) %>%
  select(sample_id, `er status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)

location_missing_data_endo <- textfile %>%
  filter(is.na(`endocrine treated:ch1`)) %>%
  select(sample_id, `er status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)

print(location_missing_data_chemo)
## # A tibble: 21 × 12
##    sample_id  `er status:ch1` `pgr status:ch1` `her2 status:ch1`
##    <chr>                <dbl>            <dbl>             <dbl>
##  1 GSM2528173               1                1                 0
##  2 GSM2528236              NA               NA                 0
##  3 GSM2528339               1                1                 0
##  4 GSM2528393               1                1                 0
##  5 GSM2528463               0                0                 0
##  6 GSM2528524              NA               NA                 0
##  7 GSM2528605               1                1                NA
##  8 GSM2528973               0                0                 0
##  9 GSM2529367               1                1                 0
## 10 GSM2529380               0                0                 1
## # ℹ 11 more rows
## # ℹ 8 more variables: `ki67 status:ch1` <dbl>, `nhg:ch1` <chr>,
## #   `lymph node group:ch1` <chr>, `lymph node status:ch1` <chr>,
## #   `tumor size:ch1` <dbl>, `age at diagnosis:ch1` <dbl>,
## #   `overall survival event:ch1` <dbl>, `endocrine treated:ch1` <dbl>
print(location_missing_data_tumor_size)
## # A tibble: 32 × 12
##    sample_id  `er status:ch1` `pgr status:ch1` `her2 status:ch1`
##    <chr>                <dbl>            <dbl>             <dbl>
##  1 GSM2528105              NA               NA                 1
##  2 GSM2528317               0                0                 1
##  3 GSM2528319               1                1                 1
##  4 GSM2528520               1                0                 0
##  5 GSM2528579               1                0                 0
##  6 GSM2528584              NA               NA                 0
##  7 GSM2528746              NA               NA                 0
##  8 GSM2528901               1                1                 0
##  9 GSM2528950               1                1                 0
## 10 GSM2528953               1                1                 0
## # ℹ 22 more rows
## # ℹ 8 more variables: `ki67 status:ch1` <dbl>, `nhg:ch1` <chr>,
## #   `lymph node group:ch1` <chr>, `lymph node status:ch1` <chr>,
## #   `age at diagnosis:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## #   `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
print(location_missing_data_her2)
## # A tibble: 105 × 12
##    sample_id  `er status:ch1` `pgr status:ch1` `ki67 status:ch1` `nhg:ch1`
##    <chr>                <dbl>            <dbl>             <dbl> <chr>    
##  1 GSM2528129               1                1                NA G3       
##  2 GSM2528213               1                1                NA G1       
##  3 GSM2528233               1                1                NA G1       
##  4 GSM2528322               1                1                NA G2       
##  5 GSM2528514               0                0                 1 <NA>     
##  6 GSM2528521               1                1                NA <NA>     
##  7 GSM2528605               1                1                 1 G3       
##  8 GSM2528616               1                1                 0 G3       
##  9 GSM2528617               1                1                 0 G2       
## 10 GSM2528620               1                0                 1 G2       
## # ℹ 95 more rows
## # ℹ 7 more variables: `lymph node group:ch1` <chr>,
## #   `lymph node status:ch1` <chr>, `age at diagnosis:ch1` <dbl>,
## #   `tumor size:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## #   `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
print(location_missing_data_er)
## # A tibble: 199 × 12
##    sample_id  `pgr status:ch1` `her2 status:ch1` `ki67 status:ch1` `nhg:ch1`
##    <chr>                 <dbl>             <dbl>             <dbl> <chr>    
##  1 GSM2528079               NA                 0                NA G3       
##  2 GSM2528084               NA                 0                NA G3       
##  3 GSM2528093               NA                 0                NA G3       
##  4 GSM2528094               NA                 0                NA G3       
##  5 GSM2528105               NA                 1                NA G3       
##  6 GSM2528106               NA                 1                NA G3       
##  7 GSM2528107               NA                 1                NA G3       
##  8 GSM2528121               NA                 1                NA G3       
##  9 GSM2528126               NA                 0                NA G3       
## 10 GSM2528127               NA                 0                NA G3       
## # ℹ 189 more rows
## # ℹ 7 more variables: `lymph node group:ch1` <chr>,
## #   `lymph node status:ch1` <chr>, `age at diagnosis:ch1` <dbl>,
## #   `tumor size:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## #   `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
print(location_missing_data_pgr)
## # A tibble: 330 × 12
##    sample_id  `er status:ch1` `her2 status:ch1` `ki67 status:ch1` `nhg:ch1`
##    <chr>                <dbl>             <dbl>             <dbl> <chr>    
##  1 GSM2528079              NA                 0                NA G3       
##  2 GSM2528082               1                 1                NA G3       
##  3 GSM2528083               1                 0                NA G2       
##  4 GSM2528084              NA                 0                NA G3       
##  5 GSM2528087               1                 1                NA G1       
##  6 GSM2528088               1                 1                NA G3       
##  7 GSM2528093              NA                 0                NA G3       
##  8 GSM2528094              NA                 0                NA G3       
##  9 GSM2528101               1                 0                NA G2       
## 10 GSM2528105              NA                 1                NA G3       
## # ℹ 320 more rows
## # ℹ 7 more variables: `lymph node group:ch1` <chr>,
## #   `lymph node status:ch1` <chr>, `age at diagnosis:ch1` <dbl>,
## #   `tumor size:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## #   `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
print(location_missing_data_endo)
## # A tibble: 22 × 12
##    sample_id  `er status:ch1` `her2 status:ch1` `ki67 status:ch1` `nhg:ch1`
##    <chr>                <dbl>             <dbl>             <dbl> <chr>    
##  1 GSM2528173               1                 0                 0 G1       
##  2 GSM2528236              NA                 0                NA G3       
##  3 GSM2528339               1                 0                 0 G1       
##  4 GSM2528393               1                 0                 0 G1       
##  5 GSM2528463               0                 0                 1 G3       
##  6 GSM2528524              NA                 0                 1 G3       
##  7 GSM2528605               1                NA                 1 G3       
##  8 GSM2528973               0                 0                 1 G2       
##  9 GSM2529367               1                 0                 0 <NA>     
## 10 GSM2529380               0                 1                 1 G3       
## # ℹ 12 more rows
## # ℹ 7 more variables: `lymph node group:ch1` <chr>,
## #   `lymph node status:ch1` <chr>, `age at diagnosis:ch1` <dbl>,
## #   `tumor size:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## #   `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>

2.2.2 Processing NA’s

mode_level_her2 <- names(which.max(table(reduced_merged_data$`her2 status:ch1`)))

mode_level_tumor_size <- names(which.max(table(reduced_merged_data$`tumor size:ch1`)))

mode_level_chemo <- names(which.max(table(reduced_merged_data$`chemo treated:ch1`)))

mode_level_er <- names(which.max(table(reduced_merged_data$`er status:ch1`)))

mode_level_pgr <- names(which.max(table(reduced_merged_data$`pgr status:ch1`)))

mode_level_endo <- names(which.max(table(reduced_merged_data$`endocrine treated:ch1`)))

no_NA_reduced_merged_data <- reduced_merged_data %>%
  mutate(
    `chemo treated:ch1` = replace(`chemo treated:ch1`,
                                  is.na(`chemo treated:ch1`),
                                  mode_level_chemo),
    `er status:ch1` = replace(`er status:ch1`,
                              is.na(`er status:ch1`),
                              mode_level_er),
    `pgr status:ch1` = replace(`pgr status:ch1`,
                               is.na(`pgr status:ch1`),
                               mode_level_pgr),
    `endocrine treated:ch1` = replace(`endocrine treated:ch1`,
                                  is.na(`endocrine treated:ch1`),
                                  mode_level_endo)
  )

no_NA_reduced_merged_data <- no_NA_reduced_merged_data %>%
  drop_na(`her2 status:ch1`) %>%
  drop_na(`tumor size:ch1`)

print(no_NA_reduced_merged_data)
## # A tibble: 2,938 × 11
##    `overall survival days:ch1` `overall survival event:ch1` ABHD11  ESR1
##                          <dbl> <fct>                         <dbl> <dbl>
##  1                        2367 0                              4.13 0.962
##  2                        2367 0                              4.34 6.17 
##  3                        2168 1                              5.27 7.98 
##  4                        2416 0                              5.87 6.05 
##  5                        2389 0                              3.32 5.50 
##  6                        2373 0                              4.51 0.263
##  7                        2380 0                              6.47 1.14 
##  8                        2324 0                              5.66 5.97 
##  9                        2367 0                              4.50 6.54 
## 10                        1962 1                              5.86 7.57 
## # ℹ 2,928 more rows
## # ℹ 7 more variables: `chemo treated:ch1` <fct>, `age at diagnosis:ch1` <dbl>,
## #   `tumor size:ch1` <dbl>, `er status:ch1` <fct>, `her2 status:ch1` <fct>,
## #   `pgr status:ch1` <fct>, `endocrine treated:ch1` <fct>

3. Univariate hypothesis

3.1 Univariate hypothesis preprocessing of data

3.1.1 Outliers

library(dplyr)

Q1 <- quantile(no_NA_reduced_merged_data$ABHD11, 0.25, na.rm = TRUE)
Q3 <- quantile(no_NA_reduced_merged_data$ABHD11, 0.75, na.rm = TRUE)
IQR_value <- IQR(no_NA_reduced_merged_data$ABHD11, na.rm = TRUE)

lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

outliers <- no_NA_reduced_merged_data %>%
  filter(ABHD11 < lower_bound | ABHD11 > upper_bound)

num_outliers <- nrow(outliers)
print(paste("Number of outliers:", num_outliers))
## [1] "Number of outliers: 54"
print(num_outliers)
## [1] 54

3.1.2 Boxplot with outlier detection

ggplot_outlier_detection <- ggplot(no_NA_reduced_merged_data, aes(y = ABHD11, x = 1)) +
  
  geom_jitter(
    width = 0.2,
    alpha = 0.3,
    color = "grey50"
  ) +
  
  geom_boxplot(
    width = 0.3,
    outlier.color = "black",
    outlier.size = 1.5,
    fill = NA,
    color = "black"
  ) +
  
  labs(
    title = "ABHD11 expression",
    x = "",
    y = "ABHD11 expression"
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(face = "bold")
  )

print(ggplot_outlier_detection)

3.1.3 Histogram of the ABHD11 gene

library(ggplot2)

Histogram_ABHD11 <- ggplot(no_NA_reduced_merged_data, aes(x = ABHD11)) +
  geom_histogram(
    bins = 60,
    fill = "steelblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of ABHD11",
    x = "ABHD11 expression",
    y = "Frequency"
  ) +
  theme_minimal(base_size = 14)

print(Histogram_ABHD11)

3.1.4 Scatterplot to assess linearity between ABHD11 and tumor size

Scatterplot_ABHD11 <- ggplot(no_NA_reduced_merged_data, 
       aes(x = ABHD11, y = `tumor size:ch1`)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = TRUE, color = "blue") +
  labs(
    x = "ABHD11 expression (log-transformed)",
    y = "Tumor size (cm)",
    title = "Scatterplot of ABHD11 Expression vs Tumor Size"
  ) +
  theme_minimal()

print(Scatterplot_ABHD11)
## `geom_smooth()` using formula = 'y ~ x'

3.2 Univariate hypothesis test with restriction on tumor size

3.2.1 FOR SMALL TUMORS

no_NA_reduced_merged_data$`tumor size:ch1` <- as.numeric(no_NA_reduced_merged_data$`tumor size:ch1`)
# Stap 1: filter for small tumors (<2 cm)
small_tumors <- no_NA_reduced_merged_data[no_NA_reduced_merged_data$`tumor size:ch1` < 2, ]

3.2.1.1 Spearman correlation

spearman_test <- cor.test(
  small_tumors$ABHD11,
  small_tumors$`tumor size:ch1`,
  method = "spearman"
)
## Warning in cor.test.default(small_tumors$ABHD11, small_tumors$`tumor size:ch1`,
## : Cannot compute exact p-value with ties
print(spearman_test)
## 
##  Spearman's rank correlation rho
## 
## data:  small_tumors$ABHD11 and small_tumors$`tumor size:ch1`
## S = 530.72, p-value = 0.5696
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1664101

3.2.1.2 Scatter plot with loess trend line

Scatterplot_loess <- ggplot(small_tumors, aes(x = ABHD11, y = `tumor size:ch1`)) +
  geom_point() +
  geom_smooth(method = "loess", color = "blue") +
  labs(
    x = "ABHD11 expression (log-transformed)",
    y = "Tumor size (cm)",
    title = "Scatterplot: ABHD11 vs Tumor Size (<2 cm)"
  ) +
  theme_minimal()

print(Scatterplot_loess)
## `geom_smooth()` using formula = 'y ~ x'

3.2.2 FOR ALL TUMORS

3.2.2.1 Spearman correlation

spearman_test <- cor.test(
  no_NA_reduced_merged_data$ABHD11,
  no_NA_reduced_merged_data$`tumor size:ch1`,
  method = "spearman"
)
## Warning in cor.test.default(no_NA_reduced_merged_data$ABHD11,
## no_NA_reduced_merged_data$`tumor size:ch1`, : Cannot compute exact p-value with
## ties
print(spearman_test)
## 
##  Spearman's rank correlation rho
## 
## data:  no_NA_reduced_merged_data$ABHD11 and no_NA_reduced_merged_data$`tumor size:ch1`
## S = 3725756477, p-value = 1.161e-10
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1185242

3.2.2.2 Scatter plot with loess trend line

library(ggplot2)

Scatterplot_3 <- ggplot(no_NA_reduced_merged_data, aes(x = ABHD11, y = `tumor size:ch1`)) +
  geom_point() +
  geom_smooth(method = "loess", color = "blue") +
  labs(
    x = "ABHD11 expression (log-transformed)",
    y = "Tumor size (cm)",
    title = "Scatterplot: ABHD11 vs Tumor Size"
  ) +
  theme_minimal()

print(Scatterplot_3)
## `geom_smooth()` using formula = 'y ~ x'

4. Multivariate hypothesis

4.1 t-SNE

library(matrixStats)
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
library(Rtsne)

set.seed(42)  

4.1.1 Convert to numeric properly

BC_data_asdata <- as.data.frame(BC_data_new)

4.1.2 Remove whitespace and convert all columns to numeric

BC_data_asdata[] <- lapply(BC_data_asdata, function(x) as.numeric(trimws(x)))

4.1.3 Ensure BC_data is a numeric matrix. BC_data must be: rows = genes, columns = samples

BC_data_matrix <- as.matrix(BC_data_asdata)

4.1.4 Compute variance per gene (FAST)

gene_var <- rowVars(BC_data_matrix)              # numeric vector of variances
names(gene_var) <- rownames(BC_data_matrix)      # label variance values with gene names

4.1.5 Select top 1000 most variable genes

n_top <- min(1000, nrow(BC_data_matrix))         # if fewer than 1000 genes, adjust automatically
top_genes <- names(sort(gene_var, decreasing = TRUE))[1:n_top]

4.1.6 Subset the expression matrix to top genes

expr_top <- BC_data_matrix[top_genes, ]          # still genes x samples

4.1.7 Transpose to samples x genes (t-SNE expects samples as rows)

expr_top_t <- t(expr_top)

4.1.8 Scale expression values (z-score per gene)

expr_scaled <- scale(expr_top_t)

4.1.9 Run t-SNE (2 dimensions)

tsne_res <- Rtsne(
  expr_scaled,
  dims = 2,
  perplexity = 30,    # adjust depending on sample size
  verbose = TRUE,
  pca = FALSE
)
## Read the 3409 x 1000 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 22.49 seconds (sparsity = 0.047856)!
## Learning embedding...
## Iteration 50: error is 84.638146 (50 iterations in 1.05 seconds)
## Iteration 100: error is 83.534885 (50 iterations in 0.97 seconds)
## Iteration 150: error is 84.504285 (50 iterations in 0.87 seconds)
## Iteration 200: error is 82.771476 (50 iterations in 1.13 seconds)
## Iteration 250: error is 82.853195 (50 iterations in 1.21 seconds)
## Iteration 300: error is 2.964818 (50 iterations in 0.80 seconds)
## Iteration 350: error is 2.816146 (50 iterations in 0.63 seconds)
## Iteration 400: error is 2.763845 (50 iterations in 0.63 seconds)
## Iteration 450: error is 2.738270 (50 iterations in 0.63 seconds)
## Iteration 500: error is 2.725157 (50 iterations in 0.63 seconds)
## Iteration 550: error is 2.718625 (50 iterations in 0.63 seconds)
## Iteration 600: error is 2.714560 (50 iterations in 0.63 seconds)
## Iteration 650: error is 2.711677 (50 iterations in 0.63 seconds)
## Iteration 700: error is 2.708887 (50 iterations in 0.63 seconds)
## Iteration 750: error is 2.706729 (50 iterations in 0.64 seconds)
## Iteration 800: error is 2.705620 (50 iterations in 0.63 seconds)
## Iteration 850: error is 2.704790 (50 iterations in 0.63 seconds)
## Iteration 900: error is 2.703746 (50 iterations in 0.63 seconds)
## Iteration 950: error is 2.702862 (50 iterations in 0.64 seconds)
## Iteration 1000: error is 2.701858 (50 iterations in 0.63 seconds)
## Fitting performed in 14.89 seconds.
summary(tsne_res)
##                     Length Class  Mode   
## N                      1   -none- numeric
## Y                   6818   -none- numeric
## costs               3409   -none- numeric
## itercosts             20   -none- numeric
## origD                  1   -none- numeric
## perplexity             1   -none- numeric
## theta                  1   -none- numeric
## max_iter               1   -none- numeric
## stop_lying_iter        1   -none- numeric
## mom_switch_iter        1   -none- numeric
## momentum               1   -none- numeric
## final_momentum         1   -none- numeric
## eta                    1   -none- numeric
## exaggeration_factor    1   -none- numeric

4.1.10 Create a dataframe with t-SNE coordinates

tsne_df <- data.frame(
  Sample = rownames(expr_scaled),
  TSNE1  = tsne_res$Y[,1],
  TSNE2  = tsne_res$Y[,2],
  stringsAsFactors = FALSE
)

head(tsne_df)

4.1.11 Create a scatter plot for the t_SNE

Scatter_tSNE <- ggplot(tsne_df, aes(x = TSNE1, y = TSNE2)) +
  geom_point(color = "steelblue", size = 2, alpha = 0.7) +  # punten
  theme_minimal() +                                         # nette achtergrond
  labs(
    title = "t-SNE 2-Dimensional Digit Visualization",
    x = "t-SNE Dimension 1",
    y = "t-SNE Dimension 2"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.title = element_text(size = 14)
  )

print(Scatter_tSNE)

4.1.12 Merge t-SNE coordinates with merged_data on sample ID (F1, F2, F3,…)

tsne_merged <- merge(
  tsne_df,     
  merged_data_2,       
  by.x = "Sample",  
  by.y = "title",      
  all.x = TRUE
)

4.1.13 Create plots for the clinical variables of interest

4.1.13.1 PLOT FOR CHEMOTHERAPY

Plot_chemo <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`chemo treated:ch1`))) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
  theme_minimal() +
  labs(title = "t-SNE colored by Chemo Treatment",
       x = "t-SNE 1", y = "t-SNE 2")

print(Plot_chemo)

4.1.13.2 PLOT FOR SURVIVAL EVENT

plot_surv <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`overall survival event:ch1`))) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
  theme_minimal() +
  labs(title = "t-SNE colored by overall survival event",
       x = "t-SNE 1", y = "t-SNE 2")

print(plot_surv)

4.1.13.3 PLOT FOR ER STATUS

plot_er <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`er status:ch1`))) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
  theme_minimal() +
  labs(title = "t-SNE colored by ER status",
       x = "t-SNE 1", y = "t-SNE 2")

print(plot_er)

4.1.13.4 PLOT FOR HER2 STATUS

plot_her2 <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`her2 status:ch1`))) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
  theme_minimal() +
  labs(title = "t-SNE colored by HER2 status",
       x = "t-SNE 1", y = "t-SNE 2")

print(plot_her2)

4.1.13.5 PLOT FOR PGR STATUS

plot_pgr <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`pgr status:ch1`))) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
  theme_minimal() +
  labs(title = "t-SNE colored by PGR status",
       x = "t-SNE 1", y = "t-SNE 2")

print(plot_pgr)

4.1.13.6 PLOT FOR AGE AT DIAGNOSIS

plot_aad <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = `age at diagnosis:ch1`)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_viridis_c(option = "plasma") +
  theme_minimal() +
  labs(title = "t-SNE colored by Age at Diagnosis",
       x = "t-SNE 1", y = "t-SNE 2")

print(plot_aad)

4.1.13.7 PLOT FOR OVERALL SURVIVAL DAYS

plot_osd <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = `overall survival days:ch1`)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_viridis_c(option = "plasma") +
  theme_minimal() +
  labs(title = "t-SNE colored by overall survival days",
       x = "t-SNE 1", y = "t-SNE 2")

print(plot_osd)

4.1.13.8 PLOT FOR TUMOR SIZE

plot_tumor <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = `tumor size:ch1`)) +
  geom_point(size = 2, alpha = 0.8) +
  scale_color_viridis_c(option = "plasma") +
  theme_minimal() +
  labs(title = "t-SNE colored by tumor size",
       x = "t-SNE 1", y = "t-SNE 2")

print(plot_tumor)

4.2 Heatmap gene expression of original data to check if standardisation is needed

library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.26.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.16
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
# Step 1: Calculate variance
gene_var <- apply(BC_data_new, 1, var)
gene_var_filtered <- gene_var[gene_var > 0.01]
top_genes <- names(sort(gene_var_filtered, decreasing = TRUE))[1:500]

# Step 2: Prepare matrix
BC_data_heatmap <- BC_data_new[top_genes, ]
BC_data_heatmap <- as.matrix(BC_data_heatmap)

ht <- Heatmap(
  BC_data_heatmap,
  name = "Expression",
  show_row_names = FALSE,
  show_column_names = FALSE,
  clustering_method_rows = "complete",
  clustering_method_columns = "complete",
  column_title = "Heatmap of top 500 most variable genes",
  row_title = "Genes"
)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
draw(ht)

4.3 Heatmap gene expression of scaled data to check if standardisation is needed

library(ComplexHeatmap)
library(circlize)

BC_data_heatmap_2 <- t(scale(t(BC_data_heatmap)))  # optional scaling

# Step 3: Save to file
ht_2 <- Heatmap(
  BC_data_heatmap_2,
  name = "Expression",
  show_row_names = FALSE,
  show_column_names = FALSE,
  clustering_method_rows = "complete",
  clustering_method_columns = "complete",
  column_title = "Heatmap of top 500 most variable genes",
  row_title = "Genes"
)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
draw(ht_2)

4.4 checking for outliers

4.4.1 Copy dataset

data_out <- merged_data_2

4.4.2 Select variables used in the Cox model for multivariate outlier detection

vars <- data_out[, c("ESR1",
                     "chemo treated:ch1",
                     "overall survival days:ch1",
                     "age at diagnosis:ch1",
                     "tumor size:ch1",
                     "er status:ch1",
                     "her2 status:ch1",
                     "pgr status:ch1")]

4.4.3 Convert all to numeric (factors become numeric 0/1/2…, which is fine for outlier detection)

vars_num <- data.frame(lapply(vars, function(x) as.numeric(as.character(x))))

4.4.4 Keep only rows without NA for covariance calculation

complete_idx <- complete.cases(vars_num)
vars_complete <- vars_num[complete_idx, ]

4.4.5 Compute Mahalanobis distance

cov_matrix <- cov(vars_complete)
center <- colMeans(vars_complete)

mahal_dist <- mahalanobis(vars_complete, center, cov_matrix)

4.4.6 Threshold (p = 0.001)

threshold <- qchisq(0.999, df = ncol(vars_complete))

4.4.7 Local outliers (within complete-case subset)

outliers_local <- which(mahal_dist > threshold)

4.4.8 Re-map to original indices

outliers <- which(complete_idx)[outliers_local]

print(outliers)
##  [1]   92  238  300  398  414  438  445  454  466  478  514  529  553  578  599
## [16]  640  641  737  740  784  786  967 1116 1164 1395 1436 1445 1468 1477 1485
## [31] 1504 1554 1558 1571 1614 1632 1644 1657 1707 1764 1817 1820 1836 1863 1869
## [46] 1892 1897 1899 1949 1964 1992 1994 2004 2011 2020 2022 2040 2100 2111 2123
## [61] 2151 2174 2207 2225 2252 2260 2273 2343 2493 2678 2687 2739 2773 2830 2833
## [76] 2841 2842 2845 2883 2885 2906 2914 2927 2938 2978 3012 3029 3041 3042 3067

4.5 Testing hypothesis with Cox model

no_NA_reduced_merged_data$`chemo treated:ch1`<- as.factor(no_NA_reduced_merged_data$`chemo treated:ch1`)
no_NA_reduced_merged_data$`overall survival event:ch1` <- as.numeric(no_NA_reduced_merged_data$`overall survival event:ch1`)
no_NA_reduced_merged_data$`overall survival days:ch1` <- as.numeric(no_NA_reduced_merged_data$`overall survival days:ch1`)

library(survival)
cox_model <- coxph(Surv(`overall survival days:ch1`, `overall survival event:ch1`) ~ 
                   ESR1 * `chemo treated:ch1` + `age at diagnosis:ch1` + `tumor size:ch1` + `er status:ch1` + `her2 status:ch1` + `pgr status:ch1`, 
                   data = no_NA_reduced_merged_data)
summary(cox_model)
## Call:
## coxph(formula = Surv(`overall survival days:ch1`, `overall survival event:ch1`) ~ 
##     ESR1 * `chemo treated:ch1` + `age at diagnosis:ch1` + `tumor size:ch1` + 
##         `er status:ch1` + `her2 status:ch1` + `pgr status:ch1`, 
##     data = no_NA_reduced_merged_data)
## 
##   n= 2938, number of events= 318 
## 
##                                coef exp(coef)  se(coef)      z Pr(>|z|)    
## ESR1                      -0.117756  0.888912  0.028908 -4.073 4.63e-05 ***
## `chemo treated:ch1`1       0.400522  1.492604  0.249252  1.607   0.1081    
## `age at diagnosis:ch1`     0.067769  1.070118  0.005990 11.313  < 2e-16 ***
## `tumor size:ch1`           0.019055  1.019238  0.002783  6.846 7.60e-12 ***
## `er status:ch1`1           0.340241  1.405287  0.266274  1.278   0.2013    
## `her2 status:ch1`1         0.159112  1.172469  0.170176  0.935   0.3498    
## `pgr status:ch1`1         -0.277404  0.757749  0.203278 -1.365   0.1724    
## ESR1:`chemo treated:ch1`1 -0.110152  0.895698  0.047096 -2.339   0.0193 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                           exp(coef) exp(-coef) lower .95 upper .95
## ESR1                         0.8889     1.1250    0.8399    0.9407
## `chemo treated:ch1`1         1.4926     0.6700    0.9158    2.4328
## `age at diagnosis:ch1`       1.0701     0.9345    1.0576    1.0828
## `tumor size:ch1`             1.0192     0.9811    1.0137    1.0248
## `er status:ch1`1             1.4053     0.7116    0.8339    2.3682
## `her2 status:ch1`1           1.1725     0.8529    0.8399    1.6367
## `pgr status:ch1`1            0.7577     1.3197    0.5087    1.1286
## ESR1:`chemo treated:ch1`1    0.8957     1.1164    0.8167    0.9823
## 
## Concordance= 0.771  (se = 0.014 )
## Likelihood ratio test= 319  on 8 df,   p=<2e-16
## Wald test            = 324.6  on 8 df,   p=<2e-16
## Score (logrank) test = 384.2  on 8 df,   p=<2e-16
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
survival_plot <- ggsurvplot(survfit(cox_model), data = no_NA_reduced_merged_data, risk.table = TRUE)
## Warning in survfit.coxph(cox_model): the model contains interactions; the
## default curve based on columm means of the X matrix is almost certainly not
## useful. Consider adding a newdata argument.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(survival_plot)
## Ignoring unknown labels:
## • colour : "Strata"

summary(cox_model)
## Call:
## coxph(formula = Surv(`overall survival days:ch1`, `overall survival event:ch1`) ~ 
##     ESR1 * `chemo treated:ch1` + `age at diagnosis:ch1` + `tumor size:ch1` + 
##         `er status:ch1` + `her2 status:ch1` + `pgr status:ch1`, 
##     data = no_NA_reduced_merged_data)
## 
##   n= 2938, number of events= 318 
## 
##                                coef exp(coef)  se(coef)      z Pr(>|z|)    
## ESR1                      -0.117756  0.888912  0.028908 -4.073 4.63e-05 ***
## `chemo treated:ch1`1       0.400522  1.492604  0.249252  1.607   0.1081    
## `age at diagnosis:ch1`     0.067769  1.070118  0.005990 11.313  < 2e-16 ***
## `tumor size:ch1`           0.019055  1.019238  0.002783  6.846 7.60e-12 ***
## `er status:ch1`1           0.340241  1.405287  0.266274  1.278   0.2013    
## `her2 status:ch1`1         0.159112  1.172469  0.170176  0.935   0.3498    
## `pgr status:ch1`1         -0.277404  0.757749  0.203278 -1.365   0.1724    
## ESR1:`chemo treated:ch1`1 -0.110152  0.895698  0.047096 -2.339   0.0193 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                           exp(coef) exp(-coef) lower .95 upper .95
## ESR1                         0.8889     1.1250    0.8399    0.9407
## `chemo treated:ch1`1         1.4926     0.6700    0.9158    2.4328
## `age at diagnosis:ch1`       1.0701     0.9345    1.0576    1.0828
## `tumor size:ch1`             1.0192     0.9811    1.0137    1.0248
## `er status:ch1`1             1.4053     0.7116    0.8339    2.3682
## `her2 status:ch1`1           1.1725     0.8529    0.8399    1.6367
## `pgr status:ch1`1            0.7577     1.3197    0.5087    1.1286
## ESR1:`chemo treated:ch1`1    0.8957     1.1164    0.8167    0.9823
## 
## Concordance= 0.771  (se = 0.014 )
## Likelihood ratio test= 319  on 8 df,   p=<2e-16
## Wald test            = 324.6  on 8 df,   p=<2e-16
## Score (logrank) test = 384.2  on 8 df,   p=<2e-16
library(survival)
library(survminer)

# Create ESR1 groups
no_NA_reduced_merged_data$ESR1_group <- ifelse(no_NA_reduced_merged_data$ESR1 > 3, "High ESR1", "Low ESR1")

# Combine ESR1 and chemo into strata
no_NA_reduced_merged_data$chemo <- as.factor(no_NA_reduced_merged_data$`chemo treated:ch1`)
no_NA_reduced_merged_data$strata <- interaction(no_NA_reduced_merged_data$ESR1_group, no_NA_reduced_merged_data$`chemo`)
levels(no_NA_reduced_merged_data$strata) <- c("Low ESR1 + No Chemo", "Low ESR1 + Chemo", "High ESR1 + No Chemo", "High ESR1 + Chemo")

fit <- survfit(Surv(`overall survival days:ch1`, `overall survival event:ch1`) ~ strata, data = no_NA_reduced_merged_data)
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
surv_plot_3 <- ggsurvplot(
  fit,
  data = no_NA_reduced_merged_data,
  risk.table = TRUE,
  risk.table.height = 0.3,            # Increase space for the table
  risk.table.fontsize = 3.5,          # Reduce font size
  risk.table.y.text.col = TRUE,
  risk.table.y.text = FALSE,
  break.time.by = 500,                # Controls x-axis intervals
  xlab = "Time (days)",
  ylab = "Survival Probability",
  title = "Survival by ESR1 Expression and Chemotherapy",
  legend.title = "ESR1 × Chemo",
  legend.labs = levels(no_NA_reduced_merged_data$strata),
  palette = c("#D55E00", "#E69F00", "#56B4E9", "#009E73"),
  size = 1.2,
  linetype = "strata",
  conf.int = TRUE
)

print(surv_plot_3)
## Ignoring unknown labels:
## • colour : "ESR1 × Chemo"